This notebook performs Exploratory Data Analysis on the motor vehicle crashes data aggregated with the ACS data
df <- read_csv("../data/processed/acs_mvc_combined_2018_2023.csv")
glimpse(df)
Rows: 13,984
Columns: 63
$ geoid <dbl> 3.6005e+10, 3.6005e+10, 3.6005e+10, 3.6005e+10, 3.6005e…
$ year <dbl> 2018, 2019, 2020, 2021, 2022, 2023, 2018, 2019, 2020, 2…
$ total_population <dbl> 47900.000, 46630.000, 44881.000, 45523.000, 30541.000, …
$ pct_male_population <dbl> 13.576200, 13.607120, 13.691763, 13.568965, 13.581743, …
$ pct_female_population <dbl> 1.2045929, 1.1130174, 1.0137920, 1.0631988, 0.9757375, …
$ pct_white_population <dbl> 3.7014614, 4.7887626, 5.7039727, 5.8871340, 6.2145968, …
$ pct_black_population <dbl> 8.849687, 8.200729, 7.666941, 7.187576, 6.758128, 6.613…
$ pct_asian_population <dbl> 0.26096033, 0.38816213, 0.38100755, 0.38881445, 0.40273…
$ pct_hispanic_population <dbl> 4.862213, 5.161913, 5.158085, 4.758034, 3.837464, 3.557…
$ pct_foreign_born <dbl> 2.206681, 2.365430, 2.319467, 2.220855, 1.990767, 1.626…
$ pct_age_under_18 <dbl> 0.3569937, 0.2208878, 0.2161271, 0.2152758, 0.1899086, …
$ pct_age_18_34 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_age_35_64 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_age_65_plus <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ median_income <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 58582.658, 49…
$ pct_income_under_25k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_income_25k_75k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_income_75k_plus <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_below_poverty <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_above_poverty <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ poverty_rate <dbl> NA, NA, NA, NA, NA, NA, 22.633201, 22.440424, 21.358291…
$ median_gross_rent <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1579.11…
$ pct_owner_occupied <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_renter_occupied <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_no_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_one_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_two_plus_vehicles <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_less_than_hs <dbl> 6.3924843, 6.0261634, 5.7017446, 5.6608747, 4.8950591, …
$ pct_hs_diploma <dbl> 2.0229645, 2.7364358, 3.0547448, 3.4619862, 3.8996758, …
$ pct_some_college <dbl> 1.2526096, 1.1237401, 1.0405294, 0.9797245, 1.2540519, …
$ pct_associates_degree <dbl> 0.1691023, 0.1737079, 0.2339520, 0.2723898, 0.2521201, …
$ pct_bachelors_degree <dbl> 0.14822547, 0.15440703, 0.18938972, 0.23724271, 0.11132…
$ pct_graduate_degree <dbl> 0.01670146, 0.01930088, 0.04233417, 0.05272060, 0.13424…
$ pct_in_labor_force <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_employed <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_unemployed <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_not_in_labor_force <dbl> 14.780793, 14.720137, 14.705555, 14.632164, 14.557480, …
$ unemployment_rate <dbl> NA, NA, NA, NA, NA, NA, 15.750133, 13.478747, 10.806175…
$ pct_commute_short <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_commute_medium <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_commute_long <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_drive_alone <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_carpool <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_public_transit <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_walk <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_bike <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_work_from_home <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ total_crashes <dbl> 2, 4, 0, 0, 0, 0, 43, 30, 31, 30, 17, 20, 48, 45, 39, 2…
$ persons_injured <dbl> 1, 2, 0, 0, 0, 0, 11, 8, 12, 12, 10, 10, 10, 18, 13, 3,…
$ persons_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ pedestrians_injured <dbl> 0, 0, 0, 0, 0, 0, 2, 2, 3, 2, 4, 1, 3, 0, 2, 0, 1, 1, 1…
$ pedestrians_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ cyclists_injured <dbl> 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 1, 1, 2, 1, 0, 0, 3, 0…
$ cyclists_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ motorists_injured <dbl> 1, 2, 0, 0, 0, 0, 9, 4, 8, 10, 4, 8, 6, 16, 10, 3, 12, …
$ motorists_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ crash_rate_per_1000 <dbl> 0.04175365, 0.08578169, 0.00000000, 0.00000000, 0.00000…
$ injury_rate_per_1000 <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000000, 0.00000…
$ fatality_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pedestrian_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ cyclist_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ motorist_rate_per_1000 <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000000, 0.00000…
$ injury_fatality_ratio <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, NA, NA,…
df <- df %>%
mutate(
geoid = as.character(geoid),
year = as.integer(year),
total_population = as.integer(total_population),
total_crashes = as.integer(total_crashes),
persons_injured = as.integer(persons_injured),
persons_killed = as.integer(persons_killed),
pedestrians_injured = as.integer(pedestrians_injured),
pedestrians_killed = as.integer(pedestrians_killed),
cyclists_injured = as.integer(cyclists_injured),
cyclists_killed = as.integer(cyclists_killed),
motorists_injured = as.integer(motorists_injured),
motorists_killed = as.integer(motorists_killed)
)
glimpse(df)
Rows: 13,984
Columns: 63
$ geoid <chr> "36005000100", "36005000100", "36005000100", "360050001…
$ year <int> 2018, 2019, 2020, 2021, 2022, 2023, 2018, 2019, 2020, 2…
$ total_population <int> 47900, 46630, 44881, 45523, 30541, 24345, 51349, 50967,…
$ pct_male_population <dbl> 13.576200, 13.607120, 13.691763, 13.568965, 13.581743, …
$ pct_female_population <dbl> 1.2045929, 1.1130174, 1.0137920, 1.0631988, 0.9757375, …
$ pct_white_population <dbl> 3.7014614, 4.7887626, 5.7039727, 5.8871340, 6.2145968, …
$ pct_black_population <dbl> 8.849687, 8.200729, 7.666941, 7.187576, 6.758128, 6.613…
$ pct_asian_population <dbl> 0.26096033, 0.38816213, 0.38100755, 0.38881445, 0.40273…
$ pct_hispanic_population <dbl> 4.862213, 5.161913, 5.158085, 4.758034, 3.837464, 3.557…
$ pct_foreign_born <dbl> 2.206681, 2.365430, 2.319467, 2.220855, 1.990767, 1.626…
$ pct_age_under_18 <dbl> 0.3569937, 0.2208878, 0.2161271, 0.2152758, 0.1899086, …
$ pct_age_18_34 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_age_35_64 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_age_65_plus <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ median_income <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 58582.658, 49…
$ pct_income_under_25k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_income_25k_75k <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_income_75k_plus <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_below_poverty <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_above_poverty <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ poverty_rate <dbl> NA, NA, NA, NA, NA, NA, 22.633201, 22.440424, 21.358291…
$ median_gross_rent <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1579.11…
$ pct_owner_occupied <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_renter_occupied <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_no_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_one_vehicle <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_two_plus_vehicles <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_less_than_hs <dbl> 6.3924843, 6.0261634, 5.7017446, 5.6608747, 4.8950591, …
$ pct_hs_diploma <dbl> 2.0229645, 2.7364358, 3.0547448, 3.4619862, 3.8996758, …
$ pct_some_college <dbl> 1.2526096, 1.1237401, 1.0405294, 0.9797245, 1.2540519, …
$ pct_associates_degree <dbl> 0.1691023, 0.1737079, 0.2339520, 0.2723898, 0.2521201, …
$ pct_bachelors_degree <dbl> 0.14822547, 0.15440703, 0.18938972, 0.23724271, 0.11132…
$ pct_graduate_degree <dbl> 0.01670146, 0.01930088, 0.04233417, 0.05272060, 0.13424…
$ pct_in_labor_force <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_employed <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_unemployed <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_not_in_labor_force <dbl> 14.780793, 14.720137, 14.705555, 14.632164, 14.557480, …
$ unemployment_rate <dbl> NA, NA, NA, NA, NA, NA, 15.750133, 13.478747, 10.806175…
$ pct_commute_short <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_commute_medium <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
$ pct_commute_long <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_drive_alone <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_carpool <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_public_transit <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ pct_walk <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_bike <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pct_work_from_home <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ total_crashes <int> 2, 4, 0, 0, 0, 0, 43, 30, 31, 30, 17, 20, 48, 45, 39, 2…
$ persons_injured <int> 1, 2, 0, 0, 0, 0, 11, 8, 12, 12, 10, 10, 10, 18, 13, 3,…
$ persons_killed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ pedestrians_injured <int> 0, 0, 0, 0, 0, 0, 2, 2, 3, 2, 4, 1, 3, 0, 2, 0, 1, 1, 1…
$ pedestrians_killed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ cyclists_injured <int> 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 1, 1, 2, 1, 0, 0, 3, 0…
$ cyclists_killed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ motorists_injured <int> 1, 2, 0, 0, 0, 0, 9, 4, 8, 10, 4, 8, 6, 16, 10, 3, 12, …
$ motorists_killed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ crash_rate_per_1000 <dbl> 0.04175365, 0.08578169, 0.00000000, 0.00000000, 0.00000…
$ injury_rate_per_1000 <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000000, 0.00000…
$ fatality_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ pedestrian_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ cyclist_rate_per_1000 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000…
$ motorist_rate_per_1000 <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000000, 0.00000…
$ injury_fatality_ratio <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, NA, NA,…
# Visualize missingness
vis_miss(df)
# Count NA values per column
colSums(is.na(df))
geoid year total_population
0 0 0
pct_male_population pct_female_population pct_white_population
426 426 426
pct_black_population pct_asian_population pct_hispanic_population
426 426 426
pct_foreign_born pct_age_under_18 pct_age_18_34
426 426 426
pct_age_35_64 pct_age_65_plus median_income
426 426 0
pct_income_under_25k pct_income_25k_75k pct_income_75k_plus
426 426 426
pct_below_poverty pct_above_poverty poverty_rate
426 426 455
median_gross_rent pct_owner_occupied pct_renter_occupied
0 426 426
pct_no_vehicle pct_one_vehicle pct_two_plus_vehicles
426 426 426
pct_less_than_hs pct_hs_diploma pct_some_college
426 426 426
pct_associates_degree pct_bachelors_degree pct_graduate_degree
426 426 426
pct_in_labor_force pct_employed pct_unemployed
426 426 426
pct_not_in_labor_force unemployment_rate pct_commute_short
426 464 426
pct_commute_medium pct_commute_long pct_drive_alone
426 426 426
pct_carpool pct_public_transit pct_walk
426 426 426
pct_bike pct_work_from_home total_crashes
426 426 0
persons_injured persons_killed pedestrians_injured
0 0 0
pedestrians_killed cyclists_injured cyclists_killed
0 0 0
motorists_injured motorists_killed crash_rate_per_1000
0 0 426
injury_rate_per_1000 fatality_rate_per_1000 pedestrian_rate_per_1000
426 426 426
cyclist_rate_per_1000 motorist_rate_per_1000 injury_fatality_ratio
426 426 12721
Identify and remove 426 rows with extensive missingness
df$na_rate <- rowMeans(is.na(df))
# Filter out rows with high missingness
df <- df %>%
filter(rowMeans(is.na(.)) <= 0.05)
# Drop 29 rows with poverty_rate NA and/or 38 unemployment_rate NA values
df <- df %>%
drop_na(c(unemployment_rate, poverty_rate))
# Drop na_rate column
df <- df %>% select(-na_rate, -injury_fatality_ratio)
Second quality check
# Visualize missingness
vis_miss(df)
NA
NA
NO NA values!
replace_top_99_with_median <- function(x) {
# Ensure numeric
if (!is.numeric(x)) stop("Column must be numeric.")
# Calculate the 99th percentile and median
upper_threshold <- quantile(x, 0.99, na.rm = TRUE)
median_val <- median(x, na.rm = TRUE)
# Replace values above the 99th percentile with the median
x[x > upper_threshold] <- median_val
return(x)
}
per_1000_vars <- grep("per_1000", names(df), value = TRUE)
df[per_1000_vars] <- lapply(df[per_1000_vars], replace_top_99_with_median)
write_csv(df, "../data/model-ready/social-risk-crash-rate-data.csv")